{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d389a585",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = '0'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "648c4229",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/mesolitica/.local/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "from accelerate import init_empty_weights, load_checkpoint_and_dispatch\n",
    "from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer\n",
    "import safetensors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "d20eeedf",
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-14B')\n",
    "streamer = TextStreamer(tokenizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "db9a0a63",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2025-06-01 16:49:52,583] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/bin/ld: cannot find -laio: No such file or directory\n",
      "collect2: error: ld returned 1 exit status\n",
      "/usr/bin/ld: cannot find -lcufile: No such file or directory\n",
      "collect2: error: ld returned 1 exit status\n",
      "Loading checkpoint shards: 100%|██████████████████████████████████| 8/8 [00:05<00:00,  1.44it/s]\n"
     ]
    }
   ],
   "source": [
    "model = AutoModelForCausalLM.from_pretrained(\n",
    "    'Qwen/Qwen3-14B', torch_dtype = 'auto',\n",
    "    device_map=\"auto\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "91411eaa",
   "metadata": {},
   "outputs": [],
   "source": [
    "state_dict = model.state_dict()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "e17aa975",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "checkpoint-1000  checkpoint-1100  checkpoint-900\r\n"
     ]
    }
   ],
   "source": [
    "!ls lora-embedding-256-qwen3-14b-malaysian-8k"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "5863d9cc",
   "metadata": {},
   "outputs": [],
   "source": [
    "from safetensors import safe_open\n",
    "\n",
    "folder = 'lora-embedding-256-qwen3-14b-malaysian-8k/checkpoint-1100'\n",
    "\n",
    "f = safe_open(f\"{folder}/adapter_model.safetensors\", framework=\"pt\", device='cpu')\n",
    "keys = f.keys()\n",
    "keys = sorted(list(set([k.split('.lora')[0] for k in keys if '.lora' in k])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "05daf993",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████| 282/282 [00:00<00:00, 548.56it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "for k in tqdm(keys):\n",
    "    k_ori = k.replace('base_model.model.', '') + '.weight'\n",
    "    if 'embed_tokens' in k:\n",
    "        post_A = '.lora_embedding_A'\n",
    "        post_B = '.lora_embedding_B'\n",
    "    else:\n",
    "        post_A = '.lora_A.weight'\n",
    "        post_B = '.lora_B.weight'\n",
    "    A = k + post_A\n",
    "    B = k + post_B\n",
    "    \n",
    "    W = state_dict[k_ori]\n",
    "    if 'embed_tokens' not in k:\n",
    "        W = W.t()\n",
    "        \n",
    "    A = f.get_tensor(A).to(W.device)\n",
    "    B = f.get_tensor(B).to(W.device)\n",
    "    with torch.no_grad():\n",
    "        W.addmm_(A.t(), B.t(), alpha = 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "fddbfdc0",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<|im_start|>user\n",
      "\"butoh hang\" tu apa dalam bahsa kedah<|im_end|>\n",
      "<|im_start|>assistant\n",
      "<think>\n",
      "\n",
      "</think>\n",
      "\n",
      "Miskin atau takde duit. \"Butoh hang\" adalah ungkapan dialek Kedah yang digunakan untuk menggambarkan keadaan seseorang yang kekurangan wang atau dalam keadaan miskin. \"Butoh\" bermaksud tidak ada atau kurang, manakala \"hang\" adalah kata ganti nama diri kedua yang setara dengan \"anda\" atau \"engkau\" dalam Bahasa Melayu standard. Ungkapan ini biasanya digunakan dalam percakapan harian untuk menyatakan keadaan kewangan s\n"
     ]
    }
   ],
   "source": [
    "d = [\n",
    "    {'role': 'user', 'content': '\"butoh hang\" tu apa dalam bahsa kedah'}\n",
    "]\n",
    "\n",
    "inputs = tokenizer.apply_chat_template(d, return_tensors = 'pt', add_generation_prompt=True, enable_thinking = False).to('cuda')\n",
    "generate_kwargs = dict(\n",
    "    input_ids=inputs,\n",
    "    max_new_tokens=128,\n",
    "    top_p=0.95,\n",
    "    top_k=50,\n",
    "    temperature=0.9,\n",
    "    do_sample=True,\n",
    "    repetition_penalty=1.05,\n",
    "    streamer=streamer\n",
    ")\n",
    "generation_output = model.generate(**generate_kwargs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "857a6514",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<|im_start|>system\n",
      "You are going to enter reasoning mode, first you try to think step-by-step in malay after that give the final answer.<|im_end|>\n",
      "<|im_start|>user\n",
      "\"Awok ni kene bebele dengan dokte, bomo tok jalan.\" terjemah ke malay<|im_end|>\n",
      "<|im_start|>assistant\n",
      "<think>\n",
      "\n",
      "</think>\n",
      "\n",
      "\"Dia perlu mendapatkan nasihat daripada doktor, seorang pekerja pejabat tidak boleh melakukan itu.\"<|im_end|>\n"
     ]
    }
   ],
   "source": [
    "d = [\n",
    "    {'role': 'system', 'content': 'You are going to enter reasoning mode, first you try to think step-by-step in malay after that give the final answer.'},\n",
    "    {'role': 'user', 'content': '\"Awok ni kene bebele dengan dokte, bomo tok jalan.\" terjemah ke malay'},\n",
    "]\n",
    "\n",
    "inputs = tokenizer.apply_chat_template(d, return_tensors = 'pt').to('cuda')\n",
    "generate_kwargs = dict(\n",
    "    input_ids=inputs,\n",
    "    max_new_tokens=4096,\n",
    "    top_p=0.95,\n",
    "    top_k=50,\n",
    "    temperature=0.9,\n",
    "    do_sample=True,\n",
    "    repetition_penalty=1.05,\n",
    "    streamer=streamer\n",
    ")\n",
    "generation_output = model.generate(**generate_kwargs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "ad9b8e43",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<|im_start|>system\n",
      "You are going to enter reasoning mode, first you try to think step-by-step in malay after that give the final answer.<|im_end|>\n",
      "<|im_start|>user\n",
      "camne nk code softmax dalam cuda<|im_end|>\n",
      "<|im_start|>assistant\n",
      "<think>\n",
      "\n",
      "</think>\n",
      "\n",
      "Untuk melaksanakan fungsi softmax dalam CUDA, anda boleh mengikuti langkah-langkah berikut:\n",
      "\n",
      "1. Tentukan kernel untuk melakukan operasi softmax pada GPU.\n",
      "\n",
      "```c++\n",
      "__global__ void softmax_kernel(float *d_input, float *d_output, int N) {\n",
      "    int index = threadIdx.x + blockIdx.x * blockDim.x;\n",
      "    float max_val = -INFINITY;\n",
      "\n",
      "    // Cari nilai maksimum dalam baris\n",
      "    if (index < N) {\n",
      "        for (int i = 0; i < N; i++) {\n",
      "            if (d_input[index * N + i] > max_val) {\n",
      "                max_val = d_input[index * N + i];\n",
      "            }\n",
      "        }\n",
      "    }\n",
      "\n",
      "    // Terapkan fungsi softmax\n",
      "    if (index < N) {\n",
      "        for (int i = 0; i < N; i++) {\n",
      "            d_output[index * N + i] = expf(d_input[index * N + i] - max_val);\n",
      "            for (int j = 0; j < N; j++) {\n",
      "                d_output[index * N + i] += expf(d_input[j * N + i] - max_val);\n",
      "            }\n",
      "            d_output[index * N + i] = logf(d_output[index * N + i]);\n",
      "        }\n",
      "    }\n",
      "}\n",
      "```\n",
      "\n",
      "2. Salin data dari hos (CPU) ke peranti (GPU).\n",
      "\n",
      "```c++\n",
      "cudaMemcpy(d_input, h_input, sizeof(float) * N * N, cudaMemcpyHostToDevice);\n",
      "```\n",
      "\n",
      "3. Lakukan softmax menggunakan kernel yang telah ditakrifkan sebelumnya.\n",
      "\n",
      "```c++\n",
      "int threadsPerBlock = 256;\n",
      "int blocksPerGrid = ceil(N / static_cast<float>(threadsPerBlock));\n",
      "softmax_kernel<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_output, N);\n",
      "```\n",
      "\n",
      "4. Salin hasil kembali ke hos.\n",
      "\n",
      "```c++\n",
      "cudaMemcpy(h_output, d_output, sizeof(float) * N * N, cudaMemcpyDeviceToHost);\n",
      "```\n",
      "\n",
      "Pastikan untuk mengendalikan sebarang kesilapan yang mungkin berlaku semasa menyalin menjalankan kernel, serta membersihkan memori yang tidak perlu.\n",
      "\n",
      "Adalah penting untuk menyesuaikan dimensi blok dan grid untuk sesuai dengan senibina GPU anda untuk mencapai prestasi optimum. anda juga harus mempertimbangkan untuk menggunakan algorit\n"
     ]
    }
   ],
   "source": [
    "d = [\n",
    "    {'role': 'system', 'content': 'You are going to enter reasoning mode, first you try to think step-by-step in malay after that give the final answer.'},\n",
    "    {'role': 'user', 'content': 'camne nk code softmax dalam cuda'},\n",
    "]\n",
    "\n",
    "inputs = tokenizer.apply_chat_template(d, return_tensors = 'pt').to('cuda')\n",
    "generate_kwargs = dict(\n",
    "    input_ids=inputs,\n",
    "    max_new_tokens=512,\n",
    "    top_p=0.95,\n",
    "    top_k=50,\n",
    "    temperature=0.9,\n",
    "    do_sample=True,\n",
    "    repetition_penalty=1.05,\n",
    "    streamer=streamer\n",
    ")\n",
    "generation_output = model.generate(**generate_kwargs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "256ed0fe",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "No files have been modified since last commit. Skipping to prevent empty commit.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/mesolitica/Malaysian-Qwen3-14B/commit/49ce6bdb69219d86793a923c13622a90dfa0c003', commit_message='Upload tokenizer', commit_description='', oid='49ce6bdb69219d86793a923c13622a90dfa0c003', pr_url=None, repo_url=RepoUrl('https://huggingface.co/mesolitica/Malaysian-Qwen3-14B', endpoint='https://huggingface.co', repo_type='model', repo_id='mesolitica/Malaysian-Qwen3-14B'), pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.push_to_hub('mesolitica/Malaysian-Qwen3-14B')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "90360cab",
   "metadata": {},
   "outputs": [],
   "source": [
    "# model.push_to_hub('mesolitica/Malaysian-Qwen3-14B')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "f503f5c0",
   "metadata": {},
   "outputs": [],
   "source": [
    "model.save_pretrained('./14b')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "17fd1235",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Start hashing 9 files.\n",
      "Finished hashing 9 files.\n",
      "Uploading files using Xet Storage..\n",
      "Uploading...: 100%|████████████████████████| 29.5G/29.5G [07:29<00:00, 65.7MB/s]\n",
      "https://huggingface.co/mesolitica/Malaysian-Qwen3-14B/tree/main/.\n"
     ]
    }
   ],
   "source": [
    "!huggingface-cli upload mesolitica/Malaysian-Qwen3-14B ./14b ."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
